1   package org.apache.lucene.analysis.ja.dict;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.BufferedReader;
21  import java.io.IOException;
22  import java.io.Reader;
23  import java.util.ArrayList;
24  import java.util.Collections;
25  import java.util.Comparator;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.TreeMap;
29  
30  import org.apache.lucene.analysis.ja.util.CSVUtil;
31  import org.apache.lucene.util.IntsRef;
32  import org.apache.lucene.util.IntsRefBuilder;
33  import org.apache.lucene.util.fst.Builder;
34  import org.apache.lucene.util.fst.FST;
35  import org.apache.lucene.util.fst.PositiveIntOutputs;
36  
37  /**
38   * Class for building a User Dictionary.
39   * This class allows for custom segmentation of phrases.
40   */
41  public final class UserDictionary implements Dictionary {
42    
43    // phrase text -> phrase ID
44    private final TokenInfoFST fst;
45    
46    // holds wordid, length, length... indexed by phrase ID
47    private final int segmentations[][];
48    
49    // holds readings and POS, indexed by wordid
50    private final String data[];
51    
52    private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
53    
54    public static final int WORD_COST = -100000;
55    
56    public static final int LEFT_ID = 5;
57    
58    public static final int RIGHT_ID = 5;
59  
60    public static UserDictionary open(Reader reader) throws IOException {
61  
62      BufferedReader br = new BufferedReader(reader);
63      String line = null;
64      List<String[]> featureEntries = new ArrayList<>();
65  
66      // text, segmentation, readings, POS
67      while ((line = br.readLine()) != null) {
68        // Remove comments
69        line = line.replaceAll("#.*$", "");
70  
71        // Skip empty lines or comment lines
72        if (line.trim().length() == 0) {
73          continue;
74        }
75        String[] values = CSVUtil.parse(line);
76        featureEntries.add(values);
77      }
78  
79      if (featureEntries.isEmpty()) {
80        return null;
81      } else {
82        return new UserDictionary(featureEntries);
83      }
84    }
85  
86    private UserDictionary(List<String[]> featureEntries) throws IOException {
87  
88      int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
89      // TODO: should we allow multiple segmentations per input 'phrase'?
90      // the old treemap didn't support this either, and i'm not sure if it's needed/useful?
91  
92      Collections.sort(featureEntries, new Comparator<String[]>() {
93        @Override
94        public int compare(String[] left, String[] right) {
95          return left[0].compareTo(right[0]);
96       }
97      });
98      
99      List<String> data = new ArrayList<>(featureEntries.size());
100     List<int[]> segmentations = new ArrayList<>(featureEntries.size());
101     
102     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
103     Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
104     IntsRefBuilder scratch = new IntsRefBuilder();
105     long ord = 0;
106     
107     for (String[] values : featureEntries) {
108       String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
109       String[] readings = values[2].replaceAll("  *", " ").split(" ");
110       String pos = values[3];
111       
112       if (segmentation.length != readings.length) {
113         throw new RuntimeException("Illegal user dictionary entry " + values[0] +
114                                    " - the number of segmentations (" + segmentation.length + ")" +
115                                    " does not the match number of readings (" + readings.length + ")");
116       }
117       
118       int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
119       wordIdAndLength[0] = wordId;
120       for (int i = 0; i < segmentation.length; i++) {
121         wordIdAndLength[i + 1] = segmentation[i].length();
122         data.add(readings[i] + INTERNAL_SEPARATOR + pos);
123         wordId++;
124       }
125       // add mapping to FST
126       String token = values[0];
127       scratch.grow(token.length());
128       scratch.setLength(token.length());
129       for (int i = 0; i < token.length(); i++) {
130         scratch.setIntAt(i, (int) token.charAt(i));
131       }
132       fstBuilder.add(scratch.get(), ord);
133       segmentations.add(wordIdAndLength);
134       ord++;
135     }
136     this.fst = new TokenInfoFST(fstBuilder.finish(), false);
137     this.data = data.toArray(new String[data.size()]);
138     this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
139   }
140   
141   /**
142    * Lookup words in text
143    * @param chars text
144    * @param off offset into text
145    * @param len length of text
146    * @return array of {wordId, position, length}
147    */
148   public int[][] lookup(char[] chars, int off, int len) throws IOException {
149     // TODO: can we avoid this treemap/toIndexArray?
150     TreeMap<Integer, int[]> result = new TreeMap<>(); // index, [length, length...]
151     boolean found = false; // true if we found any results
152 
153     final FST.BytesReader fstReader = fst.getBytesReader();
154 
155     FST.Arc<Long> arc = new FST.Arc<>();
156     int end = off + len;
157     for (int startOffset = off; startOffset < end; startOffset++) {
158       arc = fst.getFirstArc(arc);
159       int output = 0;
160       int remaining = end - startOffset;
161       for (int i = 0; i < remaining; i++) {
162         int ch = chars[startOffset+i];
163         if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
164           break; // continue to next position
165         }
166         output += arc.output.intValue();
167         if (arc.isFinal()) {
168           final int finalOutput = output + arc.nextFinalOutput.intValue();
169           result.put(startOffset-off, segmentations[finalOutput]);
170           found = true;
171         }
172       }
173     }
174     
175     return found ? toIndexArray(result) : EMPTY_RESULT;
176   }
177   
178   public TokenInfoFST getFST() {
179     return fst;
180   }
181 
182   private static final int[][] EMPTY_RESULT = new int[0][];
183   
184   /**
185    * Convert Map of index and wordIdAndLength to array of {wordId, index, length}
186    * @return array of {wordId, index, length}
187    */
188   private int[][] toIndexArray(Map<Integer, int[]> input) {
189     ArrayList<int[]> result = new ArrayList<>();
190     for (int i : input.keySet()) {
191       int[] wordIdAndLength = input.get(i);
192       int wordId = wordIdAndLength[0];
193       // convert length to index
194       int current = i;
195       for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
196         int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
197         result.add(token);
198         current += wordIdAndLength[j];
199       }
200     }
201     return result.toArray(new int[result.size()][]);
202   }
203 
204   public int[] lookupSegmentation(int phraseID) {
205     return segmentations[phraseID];
206   }
207   
208   @Override
209   public int getLeftId(int wordId) {
210     return LEFT_ID;
211   }
212   
213   @Override
214   public int getRightId(int wordId) {
215     return RIGHT_ID;
216   }
217   
218   @Override
219   public int getWordCost(int wordId) {
220     return WORD_COST;
221   }
222   
223   @Override
224   public String getReading(int wordId, char surface[], int off, int len) {
225     return getFeature(wordId, 0);
226   }
227   
228   @Override
229   public String getPartOfSpeech(int wordId) {
230     return getFeature(wordId, 1);
231   }
232   
233   @Override
234   public String getBaseForm(int wordId, char surface[], int off, int len) {
235     return null; // TODO: add support?
236   }
237   
238   @Override
239   public String getPronunciation(int wordId, char surface[], int off, int len) {
240     return null; // TODO: add support?
241   }
242   
243   @Override
244   public String getInflectionType(int wordId) {
245     return null; // TODO: add support?
246   }
247 
248   @Override
249   public String getInflectionForm(int wordId) {
250     return null; // TODO: add support?
251   }
252   
253   private String[] getAllFeaturesArray(int wordId) {
254     String allFeatures = data[wordId-CUSTOM_DICTIONARY_WORD_ID_OFFSET];
255     if(allFeatures == null) {
256       return null;
257     }
258     
259     return allFeatures.split(INTERNAL_SEPARATOR);
260   }
261   
262   
263   private String getFeature(int wordId, int... fields) {
264     String[] allFeatures = getAllFeaturesArray(wordId);
265     if (allFeatures == null) {
266       return null;
267     }
268     StringBuilder sb = new StringBuilder();
269     if (fields.length == 0) { // All features
270       for (String feature : allFeatures) {
271         sb.append(CSVUtil.quoteEscape(feature)).append(",");
272       }
273     } else if (fields.length == 1) { // One feature doesn't need to escape value
274       sb.append(allFeatures[fields[0]]).append(",");
275     } else {
276       for (int field : fields){
277         sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
278       }
279     }
280     return sb.deleteCharAt(sb.length() - 1).toString();
281   }
282   
283 }